import os
import sys
import requests
import json
import re
from lxml import etree

# Make modules in the current working directory importable (needed for the
# `pub_func` import below, which lives next to this script).
# NOTE(review): this assumes the script is launched from its own directory.
cwd = os.getcwd()
sys.path.append(cwd)
print(cwd)

from pub_func import file_write, folder_exist

# Search endpoint of nipic.com's image search.
base_url = 'http://soso.nipic.com/'
# Pretend to be a desktop Chrome browser so the site serves the normal page.
headers = {
    'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
}
# Default query-string parameters; 'q' and 'page' are overwritten per request.
params = {
    'q':'雨',
    'y':'100', # 100 images per result page
    'page':'1',
    'k':'2',
}

if __name__ == '__main__':
    # Chinese cloud-genus keywords to search for, one CSV output file each.
    cloud_type = ['絮状高积云','透光高积云','荚状高积云','积云性高积云','蔽光高积云',
    '堡状高积云','透光高层云','蔽光高层云','伪卷云','密卷云',
    '毛卷云','钩卷云','卷积云','匀卷层云','毛卷层云',
    '雨层云','碎雨云','碎积云','浓积云','淡积云',
    '鬃积雨云','秃积雨云','碎层云','层云','透光层积云',
    '荚状层积云','积云性层积云','蔽光层积云','堡状层积云',]

    base_dir = '/home/ubuntu/workspace/cloud_type' # folder holding the CSV files
    folder_exist(base_dir)

    for cloud_index, word in enumerate(cloud_type):
        params['q'] = word
        count = 0
        # Output path is per-keyword, not per-page: hoist it out of the page loop.
        file_path = os.path.join(base_dir, 'nipic_' + str(cloud_index) + '.csv')

        for page in range(1, 501):
            params['page'] = page
            response = requests.get(url=base_url, headers=headers, params=params)
            response.encoding = 'utf8'
            html = etree.HTML(response.text)
            list_img_url = html.xpath('//*[@id="img-list-outer"]/li/a/img/@data-original')
            list_img_alt = html.xpath('//*[@id="img-list-outer"]/li/a/img/@alt')
            # The total hit count lives in a @total attribute; guard against an
            # empty xpath result (the original indexed [0] unconditionally and
            # crashed with IndexError on layout changes / empty result pages).
            total_attr = html.xpath('//*[@id="left-imgList-img"]/div/div[2]/div/div[3]/@total')
            total = int(total_attr[0]) if total_attr else 0
            print(list_img_url, len(list_img_url), total)

            # Write this page's rows BEFORE testing the stop condition. The
            # original broke out of the loop first, silently dropping the last
            # page of every keyword (and everything when total <= 100).
            for img_url, img_alt in zip(list_img_url, list_img_alt):
                count += 1
                img_name = str(cloud_index + 1) + '_' + str(count) + '.jpg'
                content = img_name + ',' + img_url + ',' + img_alt + '\n'
                file_write(file_path, content, 'a')

            # Stop once all reported results have been paged through
            # (100 results per page, per the 'y' parameter).
            if total <= page * 100:
                break
